#!/bin/bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 

# Validate the command line: exactly two non-empty arguments are required.
#   $1 - crawl directory; later steps build paths such as
#        $CRAWL_PATH/segments/* from it
#   $2 - Solr URL handed to the solrindex and solrdedup steps
# Empty values are rejected as well: with an empty crawl directory the
# segment paths (including the one passed to 'rm -rf') would resolve
# against the filesystem root.
if [ $# -ne 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
    echo "parse_index <crawlDir> <solrURL>" >&2
    # 255 is the status bash reports for the historical 'exit -1'.
    exit 255
fi

CRAWL_PATH="$1"
SOLRURL="$2"

# Resolve links - $0 may be a softlink. Follow the chain until THIS no
# longer names a symbolic link.
THIS="$0"
while [ -h "$THIS" ]; do
  ls=$(ls -ld "$THIS")
  # The link target is whatever follows "-> " in the long listing.
  link=$(expr "$ls" : '.*-> \(.*\)$')
  # A target containing a slash is used as-is; a bare name is taken
  # relative to the directory holding the link.
  if expr "$link" : '.*/.*' > /dev/null; then
    THIS="$link"
  else
    THIS=$(dirname "$THIS")/"$link"
  fi
done
THIS_DIR=$(dirname "$THIS")
# NUTCH_HOME is the parent of the directory containing the resolved script.
NUTCH_HOME=$(cd "$THIS_DIR/.." ; pwd)

# Succeeds (status 0) when directory $1 contains at least one regular file
# matching *nutch*.job, fails (status 1) otherwise.
# Candidates are tested one at a time: handing an expanded glob straight to
# '[ -f ... ]' errors out as soon as more than one file matches or the path
# contains whitespace.
has_nutch_job_file() {
  local candidate
  for candidate in "$1"/*nutch*.job; do
    # An unmatched glob stays a literal pattern, which is not a regular file.
    [ -f "$candidate" ] && return 0
  done
  return 1
}

# The run mode is determined by the presence of a job file in NUTCH_HOME.
mode=local
if has_nutch_job_file "$NUTCH_HOME"; then
    mode=distributed
    echo "Running in distributed mode"
fi

# Absolute path of the directory named by $0; the nutch launcher is run
# from here. CDPATH is cleared for the cd so that it cannot print a path
# into the command substitution.
bin=$(dirname "$0")
bin=$(CDPATH= cd -- "$bin" && pwd)

# Note that some of the options listed here could be set in the
# corresponding hadoop site xml param file.
# commonOptions is deliberately a single string: it is expanded unquoted
# where it is used so that it splits into separate arguments.
commonOptions="-D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"

# numTasks is not assigned anywhere in this script, so it can only come from
# the environment. Pass the reducer count only when it actually has a value;
# otherwise an empty "-D mapred.reduce.tasks=" would be handed to the job.
if [ -n "${numTasks:-}" ]; then
    commonOptions="-D mapred.reduce.tasks=$numTasks $commonOptions"
fi

 # check that hadoop can be found on the path 
# Succeeds when a command named 'hadoop' can be resolved by the shell.
# 'command -v' is a shell builtin with a defined exit status, whereas
# counting the output lines of 'which' depends on that tool printing
# nothing when the lookup fails.
hadoop_on_path() {
  command -v hadoop > /dev/null 2>&1
}

# Only distributed mode needs the hadoop executable.
if [ "$mode" = "distributed" ] && ! hadoop_on_path; then
    echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode." >&2
    # 255 is the status bash reports for the historical 'exit -1'.
    exit 255
fi


# Enable the skipping of records during parsing so that a single dodgy
# document does not fail the full task. Kept as one string and expanded
# unquoted below so that it splits into separate arguments.
skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"

# Process every entry under <crawlDir>/segments in turn: parse it, invert
# its links into the linkdb, index it into Solr, then run the Solr dedup.
# The first nutch command that fails aborts the script with that command's
# exit status.
for SEGMENT in "$CRAWL_PATH"/segments/*; do

  # When nothing matches, the glob is left as the literal pattern; report
  # that clearly instead of handing a non-existent path to nutch.
  if [ ! -e "$SEGMENT" ]; then
    echo "No segments found under $CRAWL_PATH/segments" >&2
    exit 1
  fi

  # parsing the segment
  echo "Parsing : $SEGMENT"

  # Remove any existing crawl_parse directory before parsing (presumably so
  # that an already-parsed segment can be parsed again - TODO confirm).
  # ${SEGMENT:?} aborts rather than deleting /crawl_parse if it were empty.
  rm -rf -- "${SEGMENT:?}/crawl_parse"

  # shellcheck disable=SC2086 # option strings are split on purpose
  "$bin/nutch" parse $commonOptions $skipRecordsOptions "$SEGMENT" || exit $?

  # note that the link inversion - indexing routine can be done within the
  # main loop on a per segment basis
  echo "Link inversion"
  "$bin/nutch" invertlinks "$CRAWL_PATH/linkdb" "$SEGMENT" || exit $?

  echo "Indexing $SEGMENT on SOLR index -> $SOLRURL"
  "$bin/nutch" solrindex "$SOLRURL" "$CRAWL_PATH/crawldb" -linkdb "$CRAWL_PATH/linkdb" "$SEGMENT" || exit $?

  echo "SOLR dedup -> $SOLRURL"
  "$bin/nutch" solrdedup "$SOLRURL" || exit $?

done

exit 0

